final-project

Code
library(tidyverse)
library(knitr)
library(gganimate)
library(kableExtra)
library(gifski)

Data Import

Code
co2 <- read_csv("co2_pcap_cons.csv")
sdi <- read_csv("sdi.csv")

Cleaning

Code
# Convert all columns except 'country' to character to avoid type conflicts
co2_clean <- co2 |>
  mutate(across(-country, as.character))

# Pivot CO2 to long
co2_long <- co2_clean |>
  pivot_longer(-country, names_to = "year", values_to = "co2_per_capita") |>
  mutate(
    year = as.integer(year),
    co2_per_capita = parse_number(co2_per_capita)
  )

# Pivot SDI to long
sdi_long <- sdi |>
  pivot_longer(-country, names_to = "year", values_to = "sdi_score") |>
  mutate(
    year = as.integer(year),
    sdi_score = as.numeric(sdi_score)
  )
Code
# Join and clean
merged_data <- inner_join(co2_long, sdi_long, by = c("country", "year")) |>
  drop_na(co2_per_capita, sdi_score)

merged_data |>
  slice_head(n = 10) |>
  kable()
country year co2_per_capita sdi_score
Afghanistan 1990 0.210 32.5
Afghanistan 1991 0.183 33.1
Afghanistan 1992 0.095 34.0
Afghanistan 1993 0.084 33.5
Afghanistan 1994 0.075 33.0
Afghanistan 1995 0.068 35.6
Afghanistan 1996 0.062 36.0
Afghanistan 1997 0.056 36.6
Afghanistan 1998 0.052 37.1
Afghanistan 1999 0.040 37.5

Data Visualization

Code
# Aggregate: average per country
avg_data <- merged_data |>
  group_by(country) |>
  summarise(
    avg_sdi = mean(sdi_score, na.rm = TRUE),
    avg_co2 = mean(co2_per_capita, na.rm = TRUE)
  )

# Scatter plot
ggplot(avg_data, aes(x = avg_sdi, y = avg_co2)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = lm) +
  labs(
    title = "Average CO₂ Emissions vs. SDI Score (per country)",
    x = "Average SDI Score",
    y = "Average CO₂ Emissions per Capita"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Code
# Animated scatter plot
ggplot(merged_data, aes(x = sdi_score, y = co2_per_capita)) +
  geom_point(aes(color = country), alpha = 0.6, show.legend = FALSE) +
  labs(
    title = "CO₂ Emissions vs. SDI Score Over Time",
    subtitle = "Year: {frame_time}",
    x = "SDI Score",
    y = "CO₂ Emissions per Capita"
  ) +
  transition_time(year) +
  ease_aes('linear') +
  theme_minimal()

Code
# Source: https://gganimate.com/

Fitting Linear Regression Model

Code
my_model <- lm(avg_co2 ~ avg_sdi, 
               data = avg_data
               )

library(broom)
tidy(my_model) |> 
  kable()
term estimate std.error statistic p.value
(Intercept) 15.1448167 1.6694848 9.071551 0
avg_sdi -0.1782317 0.0284985 -6.254076 0

estimated linear regression equation: \[ŷ = 15.14 - 0.18(x)\]

where:

  • ŷ = Estimated average C02 emmissions per capita per year over period: 1990-2019
  • x = Average SDI score over period: 1990-2019

Determining Model Fit

Code
# Variance in the response values (actual y values)
response_var <- var(avg_data$avg_co2)

# Variance in the fitted values (predicted y values)
fitted_var <- var(fitted(my_model))

# Variance in the residuals
residual_var <- var(resid(my_model))

# R² = variance in fitted / variance in response
r_squared <- fitted_var / response_var
Code
model_fit_table <- tibble(
  Metric = c(
    "Variance in Response (CO₂ per capita)",
    "Variance in Fitted Values",
    "Variance in Residuals",
    "R² (Fitted Variance / Total Response Variance)"
  ),
  Value = c(response_var, fitted_var, residual_var, r_squared)
)

model_fit_table |>
  kable("html", digits = 4, caption = "Model Fit Summary") |>
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
Model Fit Summary
Metric Value
Variance in Response (CO₂ per capita) 36.4398
Variance in Fitted Values 7.0870
Variance in Residuals 29.3529
R² (Fitted Variance / Total Response Variance) 0.1945

K-Fold Cross Validation

Code
# Define the R² function for one fold
compute_r2 <- function(fold_num, data, folds) {
  validation_idx <- which(folds == fold_num)
  
  train_data <- data[-validation_idx, ]
  validation_data <- data[validation_idx, ]
  
  model <- lm(avg_co2 ~ avg_sdi, data = train_data)
  predicted <- predict(model, newdata = validation_data)
  actual <- validation_data$avg_co2
  
  r2 <- var(predicted) / var(actual)
  return(r2)
}

# Set number of folds
set.seed(42)
k <- 10
folds <- sample(rep(1:k, length.out = nrow(avg_data)))

# Use map to apply function to each fold
r2_values <- map_dbl(1:k, ~compute_r2(.x, avg_data, folds))

# Output vector of R² values
r2_values
 [1] 0.1090643 0.1368669 0.3100264 0.3906325 0.1748400 0.3132724 0.2218084
 [8] 0.1567636 0.1988303 0.2255776

The variance in the response variable (average CO₂ emissions per capita) was approximately 36.440, while the variance in the fitted values from the regression model was 7.087. This means that about 19.45% of the variability in CO₂ emissions can be explained by a country’s average SDI score from 1990–2019. The remaining 80.55% is unexplained by this model and may be due to other factors such as industrialization, energy policies, or economic structure. While this suggests that SDI has a modest relationship with CO₂ emissions, it is not a very strong predictor on its own.

Code
ggplot(data.frame(R2 = r2_values), aes(x = R2)) +
  geom_histogram(bins = 10, fill = "skyblue", color = "black") +
  geom_vline(aes(xintercept = mean(R2)), color = "red", linetype = "dashed") +
  labs(
    title = "Cross-Validation R² Distribution",
    subtitle = paste0("Mean R²: ", round(mean(r2_values), 3)),
    x = "R² Value",
    y = "Count"
  ) +
  theme_minimal()

The histogram displays the distribution of R^2 values from 10-fold cross-validation, with a mean of approximately 0.224 (red dashed line). This suggests the model explains about 22.4% of the variance in CO₂ emissions using SDI. The spread of values indicates moderate variability across folds, reflecting limited but consistent predictive power. These results suggest the model captures some underlying relationship but may omit other important predictors.

Written Report

Introduction

For our project, we used Gapminder to find our two quantitative variables of our choosing. Those two variables were “CO2 emissions per Capita Consumption based” and the “Sustainable Development Index”. The 1st dataset contains information that tracks how much CO2 “consumption-based emissions generated in the production of goods and services according to where they were consumed, rather than where they were produced” (Our World in Data). The 2nd dataset contains information on the calculation of the development index, which is an efficiency metric made up of 2 other figures known as the “development index” and the “ecological impact index”. This index is to help measure different nations’ efficiency in delivering human development. Meaning how well can a nation maintain its population’s needs while allowing future generations to meet their needs as well. We hypothesize that there is a negative relationship between CO₂ emissions per capita (consumption-based) and the Sustainable Development Index (SDI). Specifically, as CO₂ emissions increase, we expect the SDI to decrease. The strength of this relationship is expected to be moderate to strong, depending on the country’s economic and environmental policies. The rationale behind this hypothesis is that the Sustainable Development Index penalizes nations for exceeding planetary boundaries, particularly in terms of carbon emissions. Countries that consume more and thus emit more CO₂ per capita are likely to perform worse in sustainability metrics, even if their development index is high. An outside source that supports this hypothesis is the United Nations Development Programme’s 2020 Human Development Report, titled “The Next Frontier: Human Development and the Anthropocene.” This report highlights that human development must now be measured by social progress and the ecological constraints within which development occurs. It stresses that exceeding planetary boundaries—such as carbon budgets—undermines long-term sustainability, a core principle reflected in the SDI.

Cleaning:

To prepare the data for analysis, we first converted all columns except for the country column to character type in the CO₂ dataset. This was done to prevent type mismatches during the pivot operation, as some cells may contain non-numeric characters (e.g., commas or units). We then reshaped both datasets from wide to long format, creating a consistent structure with country, year, and one quantitative variable per dataset. After reshaping, we parsed the CO₂ data to ensure it was numeric and cast year columns to integers. Finally, we merged the datasets by country and year, and removed rows containing NA values in either of the two key variables. This may have resulted in the loss of some countries or years from the analysis, potentially biasing our sample toward better-documented regions, but it was necessary to ensure data integrity and allow for a valid comparison between the two metrics.

Linear Regression Analysis

To explore the relationship between CO₂ emissions per capita and Sustainable Development Index (SDI) scores, we began by aggregating the data for each country across all years from 1990 to 2019. This allowed us to create a single observation per country representing long-term averages of each variable. We then visualized the relationship using a scatterplot, placing SDI on the x-axis and CO₂ emissions on the y-axis. The plot includes a linear regression line with a shaded confidence interval, revealing a modest negative relationship—countries with higher average SDI scores tend to have lower average per capita CO₂ emissions.

We used simple linear regression to quantify this relationship. The statistical model is:

\[ \hat{y} = 15.14 - 0.18x \]

where:
- ( \(\hat{y}\) ) is the predicted average CO₂ emissions per capita, and
- ( x ) is the average SDI score.

The slope of the regression line indicates that for every one-point increase in SDI, a country’s average per capita CO₂ emissions are expected to decrease by approximately 0.18 metric tons, holding all else constant.

To assess model fit, we calculated the variance in the response variable (CO₂), the variance in the model’s fitted values, and the variance in the residuals. These are summarized in the table below. The resulting ( R^2 ) value was approximately 0.1945, indicating that around 19.45% of the variability in CO₂ emissions per capita can be explained by differences in SDI. While this suggests a real trend, it also implies that the majority of the variation is likely due to other factors not captured in this model.

References